In [425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Raw measurement CSVs (one file per target, concatenated row-wise).
# NOTE(review): absolute local paths — consider a configurable DATA_DIR.
file1 = '/Users/fizza/path//to/Gly-data y1 copy.csv'
file2 = '/Users/fizza/path//to/Gly-data y2.csv'
df1, df2 = (pd.read_csv(path) for path in (file1, file2))
# ignore_index so the combined frame gets a clean 0..n-1 RangeIndex.
data = pd.concat([df1, df2], ignore_index=True)

def clean_target(val):
    """Coerce a raw target cell to a float.

    Sentinel strings (the 'xx' placeholder token in the raw CSVs) are
    mapped to the floor value 0.01; anything unparsable as a float
    becomes NaN (to be handled by the dropna/fillna steps downstream).

    Parameters
    ----------
    val : object
        Raw cell value from the CSV (string, number, or NaN).

    Returns
    -------
    float
        Parsed value, 0.01 for sentinel tokens, or np.nan when unparsable.
    """
    # The original membership list repeated 'xx' three times; a set with
    # the unique token is equivalent (and O(1) to check).
    if str(val).strip() in {'xx'}:
        return 0.01
    try:
        return float(val)
    except (TypeError, ValueError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # only parsing failures should map to NaN.
        return np.nan

# Parse both targets, drop rows where neither target is usable,
# then floor any remaining missing target at the detection-limit value.
for target in ('y1', 'y2'):
    data[target] = data[target].apply(clean_target)
data = data.dropna(subset=['y1', 'y2'], how='all')
for target in ('y1', 'y2'):
    data[target] = data[target].fillna(0.01)

# Feature matrix / first target vector.
X = data.drop(columns=['y1', 'y2'])
y1 = data['y1']

# Booleans are stringified first so they ride through the categorical branch.
bool_cols = list(X.select_dtypes(include='bool').columns)
X[bool_cols] = X[bool_cols].astype(str)

categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)

# Categorical: explicit 'missing' token; numeric: coerce junk to NaN,
# then mean-impute column-wise.
X[categorical_cols] = X[categorical_cols].fillna('missing').astype(str)
X[numerical_cols] = X[numerical_cols].apply(pd.to_numeric, errors='coerce')
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())

# Categorical branch: constant-impute, then one-hot encode
# (categories unseen at fit time are ignored at transform time).
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: mean-impute, then z-score standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch; output column order is cat, num.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
])

# Candidate regressors: linear baselines, kernel/tree models, and
# boosted ensembles — all with matched seeds where the estimator is stochastic.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ("CatBoost", CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)),
])

# Hold out 20% of the rows for validation (seeded for reproducibility).
X_train, X_val, y1_train, y1_val = train_test_split(
    X, y1, test_size=0.2, random_state=42
)

results = []
for name, model in models.items():
    # Preprocessing lives inside the pipeline so encoders/scalers are fit
    # only on the training split (no leakage into validation metrics).
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_train_pred = pipeline.predict(X_train)
    y1_val_pred = pipeline.predict(X_val)

    train_r2 = r2_score(y1_train, y1_train_pred)
    val_r2 = r2_score(y1_val, y1_val_pred)
    # np.sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # the `squared` keyword was deprecated in scikit-learn 1.4 and removed
    # in 1.6, so this form works across versions.
    train_rmse = float(np.sqrt(mean_squared_error(y1_train, y1_train_pred)))
    val_rmse = float(np.sqrt(mean_squared_error(y1_val, y1_val_pred)))

    results.append({
        'Model': name,
        'Train R²': train_r2,
        'Validation R²': val_r2,
        'Train RMSE': train_rmse,
        'Validation RMSE': val_rmse
    })
# Materialize the per-model metrics collected above. The original code
# referenced `results_df` without ever creating it, which raises NameError
# on a fresh Restart-&-Run-All.
results_df = pd.DataFrame(results)

plt.figure(figsize=(12, 6))
# Melt to long form so train/validation R² render as grouped bars per model.
sns.barplot(
    data=results_df.melt(id_vars='Model', value_vars=['Train R²', 'Validation R²']),
    x='Model', y='value', hue='variable'
)
plt.title('A-Ratio: R² Scores for Train and Validation')
plt.ylabel('R² Score')
plt.xticks(rotation=45)
plt.legend(title='Dataset', loc='lower right')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [433]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Re-instantiate fresh (unfitted) estimators for this cell's plots.
# NOTE(review): duplicates the earlier models dict — a shared factory
# function would avoid the copy-paste.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ("CatBoost", CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)),
])

# One actual-vs-predicted panel per model on the VALIDATION split.
fig, axes = plt.subplots(5, 2, figsize=(12, 18), dpi=600)
axes = axes.flatten()

for ax, (name, model) in zip(axes, models.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_val_pred = pipeline.predict(X_val)

    r2 = r2_score(y1_val, y1_val_pred)
    # sqrt(MSE): the `squared` kwarg of mean_squared_error was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y1_val, y1_val_pred)))

    ax.scatter(y1_val, y1_val_pred, alpha=0.6, edgecolors='k')
    # y = x reference: perfect predictions fall on this dashed line.
    lims = [y1_val.min(), y1_val.max()]
    ax.plot(lims, lims, 'r--')
    ax.set_title(name)
    ax.set_xlabel('True α-ratio')
    ax.set_ylabel('Predicted α-ratio')
    ax.legend([f"R² = {r2:.2f}\nRMSE = {rmse:.2f}"], loc='lower right')

plt.tight_layout()
plt.suptitle("Validation: Actual vs Predicted α-ratio for All Models", fontsize=18, y=1.02)
plt.show()
No description has been provided for this image
In [435]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    AdaBoostRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Fresh (unfitted) estimators for the train-split plots below.
# NOTE(review): third copy of this dict — a shared factory function
# would remove the duplication.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ("CatBoost", CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)),
])

# One actual-vs-predicted panel per model on the TRAIN split
# (useful to spot overfitting vs the validation grid above).
fig, axes = plt.subplots(5, 2, figsize=(12, 18), dpi=600)
axes = axes.flatten()

for ax, (name, model) in zip(axes, models.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)
    y1_train_pred = pipeline.predict(X_train)

    r2 = r2_score(y1_train, y1_train_pred)
    # sqrt(MSE): the `squared` kwarg of mean_squared_error was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y1_train, y1_train_pred)))

    ax.scatter(y1_train, y1_train_pred, alpha=0.6, edgecolors='k')
    # y = x reference: perfect predictions fall on this dashed line.
    lims = [y1_train.min(), y1_train.max()]
    ax.plot(lims, lims, 'r--')
    ax.set_title(name)
    ax.set_xlabel('True α-ratio')
    ax.set_ylabel('Predicted α-ratio')
    ax.legend([f"R² = {r2:.2f}\nRMSE = {rmse:.2f}"], loc='lower right')

plt.tight_layout()
plt.suptitle("Train: Actual vs Predicted α-ratio for All Models", fontsize=18, y=1.02)
plt.show()
No description has been provided for this image